Datierung¶

Das Notebook ergänzt den Anhang 'Methoden' und widmet sich der automatischen Datierung von Gedichten.

Import¶

In [1]:
import pandas as pd
import numpy as np
import re

import plotly.express as px

from tqdm.notebook import tqdm

from sklearn.model_selection import train_test_split, cross_validate, cross_val_predict, KFold, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

from sklearn.linear_model import LinearRegression, Ridge, ARDRegression
from sklearn.kernel_ridge import KernelRidge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import AdaBoostRegressor, BaggingRegressor, RandomForestRegressor, GradientBoostingRegressor
from sklearn.ensemble import HistGradientBoostingRegressor, StackingRegressor, VotingRegressor
from sklearn.neural_network import MLPRegressor

import itertools

Initial Setup¶

recreate meta¶

In [2]:
def read_metadata (path):
    metadata = pd.read_csv(path, sep=';', low_memory=False)
    
    new_header = metadata.iloc[0]
    metadata = metadata[2:]
    metadata.columns = new_header
    
    return metadata

def get_anthologies (data, RemoveVolume = False):
    anthologies = data.iloc[:,pos_id].values.tolist() # get ids   
    
    # delete .001 etc. (point - 3 digits - any non-word characters - end of word)
    anthologies = [re.sub("\\.[0-9][0-9][0-9][^A-Za-z]*$", "", x) for x in anthologies]

    if RemoveVolume : 
        anthologies = [re.sub("[0-9]$", "", x) for x in anthologies]
        anthologies = [re.sub("\\.$", "", x) for x in anthologies]
    
    return anthologies

def get_anthologies_years (data, always_get_year_of_first_ed = True):
    anthologies = get_anthologies(data)
    anthologies_years = []
        
    anthologies_years_first_ed = [re.findall("[0-9]{4}", x) for x in anthologies]
    anthologies_years_later_ed = [re.findall("\\([0-9]{4}\\)", x) for x in anthologies]
    
    for i in range(len(anthologies)):
        if always_get_year_of_first_ed or anthologies_years_later_ed[i] == []:
            anthologies_years.append(anthologies_years_first_ed[i][0])
        else:
            anthologies_years.append(re.findall("[0-9]{4}", anthologies_years_later_ed[i][0])[0])
    
    anthologies_years = [int(x) for x in anthologies_years]
    
    return anthologies_years
In [3]:
meta = pd.DataFrame()

pos_id = 0
pos_authors_names = 3
pos_lifetimes_birth = 6
pos_lifetimes_death = 7
pos_title_unified = 9
pos_text_written = 10
pos_text_published = 11

annotations = read_metadata("../resources/more/annotations.csv")

meta['id'] = annotations.iloc[:,pos_id].tolist()
meta['anthology'] = get_anthologies(annotations, RemoveVolume = True)
meta['anthology_with_volume'] = get_anthologies(annotations, RemoveVolume = False)
meta['anthology_year_first_ed'] = get_anthologies_years(annotations, always_get_year_of_first_ed = True)
meta['anthology_year_used_ed'] = get_anthologies_years(annotations, always_get_year_of_first_ed = False)
meta['corpus'] = ['add' if x == '1920.Pinthus' or x == '2022.GeschAddMod' else 'anth' for x in meta['anthology']]
meta['author'] = annotations.iloc[:,pos_authors_names].tolist()

author_birth = []
for x in annotations.iloc[:,pos_lifetimes_birth].tolist():
    try:
        author_birth.append(int(x[0:4]))
    except:
        author_birth.append(float('NaN'))
meta['author_birth'] = author_birth

author_death = []
for x in annotations.iloc[:,pos_lifetimes_death].tolist():
    try:
        author_death.append(int(x[0:4]))
    except:
        author_death.append(float('NaN'))
meta['author_death'] = author_death

meta['title'] = annotations.iloc[:,pos_title_unified].tolist()
meta['author_title'] = meta['author'] + ' – ' + meta['title']

get basic date data¶

In [4]:
def get_year_search_status (meta):
    written = meta.iloc[:,pos_text_written].tolist()
    published = meta.iloc[:,pos_text_published].tolist()
    
    results = []
    
    for this_written, this_published in zip(written, published):
        if any(char.isdigit() for char in str(this_written)) or any(char.isdigit() for char in str(this_published)):
            results.append('searched_and_found')
        elif '/' in str(this_written) or '/' in str(this_published):
            results.append('searched_but_not_found')
        else:
            results.append('not_searched')
            
    return results    

def get_written_and_published (meta, get_verified_only = False):    
    written = meta.iloc[:,pos_text_written].tolist()
    published = meta.iloc[:,pos_text_published].tolist()
    
    if get_verified_only:
        written = [x if str(x) != 'nan' and 'verified' in str(x) else float('NaN') for x in written]
        published = [x if str(x) != 'nan' and 'verified' in str(x) else float('NaN') for x in published]
     
    # convert to int
    written_int = []
    for x in written:
        try:
            x_clean = re.sub('\\.(.*)', '', re.sub('\\ (.*)', '', str(x)))
            written_int.append(int(x_clean))
        except:
            written_int.append(float('NaN'))

    published_int = []
    for x in published:
        x_clean = re.sub('\\.(.*)', '', re.sub('\\ (.*)', '', str(x)))
        try:
            published_int.append(int(x_clean))
        except:
            published_int.append(float('NaN'))
    
    return [written_int, published_int]

def get_years_gt (written, published):
    years_gt = []
    
    for i in range(len(written)):
        if str(written[i]) != 'nan': years_gt.append(written[i])
        elif str(published[i]) != 'nan': years_gt.append(published[i])
        else: years_gt.append(float('NaN'))
    
    return years_gt
In [5]:
meta['year_search_status'] = get_year_search_status(annotations)
meta['written_gt'] = get_written_and_published(annotations, get_verified_only = False)[0]
meta['published_gt'] = get_written_and_published(annotations, get_verified_only = False)[1]
meta['year_gt'] = get_years_gt(meta.written_gt.tolist(), meta.published_gt.tolist())
In [6]:
print(f"Manuell recherchierte Jahre : {meta['year_gt'].dropna().shape[0]}")
Manuell recherchierte Jahre : 3507

prepare train/test data¶

In [7]:
data = (
    meta
    .sort_values(by='year_gt', na_position='last')
    .drop_duplicates(subset="author_title")
    .sort_values(by='author_title')
    .reset_index(drop=True)
    .copy()
)

data = {
    'author_title' : data['author_title'].tolist(),
    'author_birth_year': data['author_birth'].tolist(),
    'author_death_year': data['author_death'].tolist(),
    # 'author_lifespan': (data['author_death']-data['author_birth']).tolist(),
    'first_anth_year': meta.groupby('author_title')['anthology_year_used_ed'].min().tolist(),
    'mean_anth_year': meta.groupby('author_title')['anthology_year_used_ed'].mean().tolist(),
    'last_anth_year': meta.groupby('author_title')['anthology_year_used_ed'].max().tolist(),
    'text_count' : meta.groupby('author_title').size().tolist(),
    'year_gt' : data['year_gt'].tolist()
}
data = pd.DataFrame(data)
data = data.sample(frac=1, random_state=0).reset_index(drop=True)
In [8]:
print(data.shape[0])
data.head()
10446
Out[8]:
author_title author_birth_year author_death_year first_anth_year mean_anth_year last_anth_year text_count year_gt
0 Maltiz, Friedrich Franz Apollonius – Schicksal... 1794.0 1857.0 1840 1869.900000 1909 10 NaN
1 Keiter, Therese – Die Königin 1859.0 1925.0 2022 2022.000000 2022 1 NaN
2 Kolmar, Gertrud – Marats Antlitz 1894.0 1943.0 2022 2022.000000 2022 1 1934.0
3 Wildenbruch, Ernst von – Dem Fürsten Bismarck 1845.0 1909.0 1903 1930.166667 1981 6 1890.0
4 Schollmeyer, Johann Georg – Fürsten-Größe im S... 1768.0 1839.0 1827 1827.000000 1827 1 NaN
In [9]:
possible_features = [
    'author_birth_year',
    'author_death_year', 
    # 'author_lifespan',
    'first_anth_year',
    'mean_anth_year',
    'last_anth_year',
    'text_count',
]
In [10]:
# data_traintest: texts where complete data (especially year_gt) is available
data_traintest = data.dropna(subset=possible_features + ['year_gt'])

print(data_traintest.shape[0])
data_traintest.head()
3395
Out[10]:
author_title author_birth_year author_death_year first_anth_year mean_anth_year last_anth_year text_count year_gt
2 Kolmar, Gertrud – Marats Antlitz 1894.0 1943.0 2022 2022.000000 2022 1 1934.0
3 Wildenbruch, Ernst von – Dem Fürsten Bismarck 1845.0 1909.0 1903 1930.166667 1981 6 1890.0
6 Hagenbach, Karl Rudolf – Das Feuerzeichen 1801.0 1874.0 1840 1864.875000 1891 8 1839.0
7 Miegel, Agnes – Jane 1874.0 1964.0 2022 2022.000000 2022 1 1905.0
9 Blomberg, Hugo von – Des alten Dessauers Gebet 1820.0 1871.0 1867 1885.000000 1898 4 1860.0
In [11]:
X = data_traintest[possible_features]
y = data_traintest['year_gt']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1, shuffle=True)

print(X_train.shape[0])
print(X_test.shape[0])
2716
679
In [12]:
kf = KFold(
    n_splits=5, 
    shuffle=True, 
    random_state=1
)

Approach 1: ages mean¶

train/test¶

In [13]:
# cv
ages_means, mae, mse, r2 = [], [], [], []

for train_index, test_index in kf.split(data_traintest):
    data_train = data_traintest.iloc[train_index].copy()
    data_test = data_traintest.iloc[test_index].copy()

    ages_train = (data_train['year_gt']-data_train['author_birth_year']).dropna().tolist()
    ages_train_mean = np.mean(ages_train)

    data_test['year_predict_ages_mean'] = data_test['author_birth_year'] + ages_train_mean
    errors = data_test['year_predict_ages_mean'] - data_test['year_gt']
    errors = abs(errors.dropna())

    ages_means.append(ages_train_mean)
    mae.append(errors.mean())
    mse.append((errors*errors).mean())
    r2.append(r2_score(data_test['year_gt'], data_test['year_predict_ages_mean']))

print(f"CV Ages Mean             : {np.mean(ages_means)}")
print(f"CV Mean Absolute Error   : {np.mean(mae)}")
print(f"CV Mean Squared Error    : {np.mean(mse)}")
print(f"CV R2                    : {np.mean(r2)}")
CV Ages Mean             : 39.28718703976436
CV Mean Absolute Error   : 9.898268700614482
CV Mean Squared Error    : 147.47002896705504
CV R2                    : 0.8340278398369471
In [14]:
# test set
ages_train = (y_train-X_train['author_birth_year']).dropna().tolist()
ages_train_mean = np.mean(ages_train)

y_predict = X_test['author_birth_year'] + ages_train_mean
errors = y_predict-y_test
errors = abs(errors.dropna())

print(f"Ages Mean             : {ages_train_mean}")
print(f"Mean Absolute Error   : {errors.mean()}")
print(f"Mean Squared Error    : {(errors*errors).mean()}")
print(f"R2                    : {r2_score(y_test, y_predict)}")
Ages Mean             : 39.18262150220913
Mean Absolute Error   : 9.825601193820066
Mean Squared Error    : 148.14535149802308
R2                    : 0.8172553603318362

predict¶

In [15]:
ages = (data['year_gt']-data['author_birth_year']).dropna().tolist()
ages_mean = np.mean(ages)
In [16]:
meta['year_predict_ages_mean'] = meta['author_birth'] + ages_mean
In [17]:
print(ages_mean)
print(meta.query('year_gt.notna()')['year_predict_ages_mean'].mean())
39.2524609148813
1872.4391506296618

Approach 2: year middle¶

In [18]:
def get_year_middle (element, min_age = 18):
    try:
        year_min = element['author_birth_year']+min_age
        year_max = np.nanmin([element['author_death_year'], element['first_anth_year']])
        year_middle = (year_min + year_max)/2
        return year_middle
    except:
        return float('NaN')

train/test¶

In [19]:
# cv
mae, mse, r2 = [], [], []

for train_index, test_index in kf.split(data_traintest):
    data_train = data_traintest.iloc[train_index].copy()
    data_test = data_traintest.iloc[test_index].copy()
    
    ages_train = (data_train['year_gt']-data_train['author_birth_year']).dropna().tolist()
    ages_train_min = np.min(ages_train)

    data_test['year_predict_middle'] = [get_year_middle(element, min_age = ages_train_min) for element in data_test.iloc]
    errors = data_test['year_predict_middle'] - data_test['year_gt']
    errors = abs(errors.dropna())

    mae.append(errors.mean())
    mse.append((errors*errors).mean())
    r2.append(r2_score(data_test['year_gt'], data_test['year_predict_middle']))

print(f"CV Mean Absolute Error   : {np.mean(mae)}")
print(f"CV Mean Squared Error    : {np.mean(mse)}")
print(f"CV R2                    : {np.mean(r2)}")
CV Mean Absolute Error   : 9.270250368188513
CV Mean Squared Error    : 128.98549337260678
CV R2                    : 0.8545560243415633
In [20]:
# test set
ages_train = (y_train-X_train['author_birth_year']).dropna().tolist()
ages_train_min = np.min(ages_train)

y_predict = [get_year_middle(element, min_age = ages_train_min) for element in X_test.iloc]
errors = y_predict-y_test
errors = abs(errors.dropna())

print(f"Mean Absolute Error   : {errors.mean()}")
print(f"Mean Squared Error    : {(errors*errors).mean()}")
print(f"R2                    : {r2_score(y_test, y_predict)}")
Mean Absolute Error   : 9.66642120765832
Mean Squared Error    : 137.92746686303389
R2                    : 0.8298596278765852

predict¶

In [21]:
data_predict = data.dropna(subset=['author_birth_year', 'author_death_year', 'first_anth_year']).copy()
data_predict['year_predict'] = [get_year_middle(element) for element in data_predict.iloc]
data_predict = data_predict.reset_index(drop=True)
year_predict_dic = dict(zip(data_predict['author_title'], data_predict['year_predict']))
In [22]:
for i, element in enumerate(meta.iloc):
    this_author_title = element['author_title']

    if this_author_title in data_predict['author_title'].tolist():
        meta.at[i, 'year_predict_middle'] = year_predict_dic[this_author_title]

Approach 3: machine learning¶

compare models¶

In [23]:
scoring_functions = ['neg_mean_absolute_error', 'neg_mean_squared_error', 'r2']

def eval_model (model, features=possible_features):
    results = pd.DataFrame()

    scores = cross_validate(model, X[features], y, cv=kf, scoring=scoring_functions)
    results.at['cv_complete_set', 'mean_absolute_error'] = np.mean(-scores['test_neg_mean_absolute_error'])
    results.at['cv_complete_set', 'mean_squared_error'] = np.mean(-scores['test_neg_mean_squared_error'])
    results.at['cv_complete_set', 'r2'] = np.mean(scores['test_r2'])

    scores = cross_validate(model, X_train[features], y_train, cv=kf, scoring=scoring_functions)
    results.at['cv_train_set', 'mean_absolute_error'] = np.mean(-scores['test_neg_mean_absolute_error'])
    results.at['cv_train_set', 'mean_squared_error'] = np.mean(-scores['test_neg_mean_squared_error'])
    results.at['cv_train_set', 'r2'] = np.mean(scores['test_r2'])

    model.fit(X_train[features], y_train)
    y_test_predict = model.predict(X_test[features])
    results.at['test_set', 'mean_absolute_error'] = mean_absolute_error(y_test_predict, y_test)
    results.at['test_set', 'mean_squared_error'] = mean_squared_error(y_test_predict, y_test)
    results.at['test_set', 'r2'] = r2_score(y_test_predict, y_test)

    return results
In [24]:
def get_errors (model, features=possible_features, mode='cv_complete_set'):
    if mode == 'cv_complete_set':
        y_predict = cross_val_predict(model, X[features], y, cv=kf)
        errors = np.array(y-y_predict)
    elif mode == 'cv_train_set':
        y_predict = cross_val_predict(model, X_train[features], y_train, cv=kf)
        errors = np.array(y_train-y_predict)
    elif mode == 'test_set':
        model.fit(X_train[features], y_train)
        y_predict = model.predict(X_test[features])
        errors = np.array(y_test-y_predict)
        
    return errors

def visualize_errors (errors):
    fig = px.histogram(errors, labels={'value': 'error'})
    fig.update_layout(showlegend=False)
    fig.show()

    errors_mean = np.mean(errors)
    errors_mean_abs = np.mean(np.abs(errors))
    print(f"mean_absolute_error : {round(errors_mean_abs, 4)}")
    print(f"mean_error          : {round(errors_mean, 4)}")
In [25]:
models = {
    'linear_regressor' : LinearRegression(),
    'ridge_regressor' : Ridge(),
    'ard_regressor' : ARDRegression(),
    'kernel_ridge_regressor' : KernelRidge(),
    'kneighbors_regressor' : KNeighborsRegressor(),
    'adaboost_regressor' : AdaBoostRegressor(random_state=42),
    'bagging_regressor' : BaggingRegressor(random_state=42),
    'gradient_boosting_regressor' : GradientBoostingRegressor(random_state=42),
    'hist_gradient_boosting_regressor' : HistGradientBoostingRegressor(random_state=42),    
    'random_forest_regressor' : RandomForestRegressor(random_state=42),
    'mlp_regressor' : MLPRegressor(random_state=42, max_iter=1000),
}
In [26]:
for model_name in tqdm(models):
    results = eval_model(models[model_name])

    print(f"\n\n{model_name}")
    print(results)
  0%|          | 0/11 [00:00<?, ?it/s]

linear_regressor
                 mean_absolute_error  mean_squared_error        r2
cv_complete_set             9.060317          124.456057  0.860107
cv_train_set                9.041652          123.670930  0.862956
test_set                    9.178702          127.921145  0.809713


ridge_regressor
                 mean_absolute_error  mean_squared_error        r2
cv_complete_set             9.060317          124.456054  0.860107
cv_train_set                9.041651          123.670922  0.862956
test_set                    9.178705          127.921176  0.809713


ard_regressor
                 mean_absolute_error  mean_squared_error        r2
cv_complete_set             9.063494          124.434870  0.860128
cv_train_set                9.042844          123.615798  0.863013
test_set                    9.182157          127.940928  0.809636


kernel_ridge_regressor
                 mean_absolute_error  mean_squared_error        r2
cv_complete_set             9.387051          131.611980  0.851872
cv_train_set                9.390244          131.541098  0.854253
test_set                    9.439567          133.774931  0.832859


kneighbors_regressor
                 mean_absolute_error  mean_squared_error        r2
cv_complete_set             7.137378          105.023682  0.883156
cv_train_set                7.416666          113.395689  0.875121
test_set                    7.430928          104.270987  0.854032


adaboost_regressor
                 mean_absolute_error  mean_squared_error        r2
cv_complete_set             8.792977          126.577010  0.858701
cv_train_set                8.736359          125.461120  0.861234
test_set                    9.006384          126.371332  0.802456


bagging_regressor
                 mean_absolute_error  mean_squared_error        r2
cv_complete_set             6.180390           90.009922  0.899971
cv_train_set                6.214439           90.131272  0.900602
test_set                    6.234904           85.580274  0.884559


gradient_boosting_regressor
                 mean_absolute_error  mean_squared_error        r2
cv_complete_set             7.037911           90.776419  0.898913
cv_train_set                7.059796           91.343437  0.899390
test_set                    7.274541           90.095518  0.871088


hist_gradient_boosting_regressor
                 mean_absolute_error  mean_squared_error        r2
cv_complete_set             6.500676           85.286779  0.905082
cv_train_set                6.528512           87.581662  0.903757
test_set                    6.828010           87.893407  0.880195


random_forest_regressor
                 mean_absolute_error  mean_squared_error        r2
cv_complete_set             6.028631           84.411729  0.906031
cv_train_set                6.115095           86.433683  0.904782
test_set                    6.108529           82.338954  0.887943


mlp_regressor
                 mean_absolute_error  mean_squared_error        r2
cv_complete_set             9.870442          147.816524  0.834483
cv_train_set               10.016732          153.006094  0.831131
test_set                    9.681776          138.974099  0.824130
In [27]:
random_forest_regressor = RandomForestRegressor(random_state=42)
eval_model(random_forest_regressor)
pd.DataFrame({'feature' : possible_features, 'importance' : random_forest_regressor.feature_importances_})
Out[27]:
feature importance
0 author_birth_year 0.862607
1 author_death_year 0.056364
2 first_anth_year 0.051383
3 mean_anth_year 0.013611
4 last_anth_year 0.010950
5 text_count 0.005086
In [28]:
errors = get_errors(random_forest_regressor, mode='cv_complete_set')
visualize_errors(errors)
mean_absolute_error : 6.0286
mean_error          : -0.0307

test random states¶

In [29]:
random_forest_regressor = RandomForestRegressor(random_state=42)
In [30]:
results = pd.DataFrame()

for i in tqdm(range(30)):
    kf_rs = KFold(n_splits=5, shuffle=True, random_state=i)   
    scores = cross_validate(random_forest_regressor, X, y, cv=kf_rs, scoring=('neg_mean_absolute_error', 'neg_mean_squared_error'))
    results.at[i, 'cv_mae'] = np.mean(-scores['test_neg_mean_absolute_error'])
    results.at[i, 'cv_mse'] = np.mean(-scores['test_neg_mean_squared_error'])

    X_train_rs, X_test_rs, y_train_rs, y_test_rs = train_test_split(X, y, test_size=0.2, random_state=i, shuffle=True)
    random_forest_regressor.fit(X_train_rs, y_train_rs)
    y_test_predict = random_forest_regressor.predict(X_test_rs)
    results.at[i, 'mae'] = mean_absolute_error(y_test_predict, y_test_rs)
    results.at[i, 'mse'] = mean_squared_error(y_test_predict, y_test_rs)
  0%|          | 0/30 [00:00<?, ?it/s]
In [31]:
px.box(results, y = ['cv_mae', 'mae'], points='all', labels={'variable':'', 'value':''}, hover_name=results.index).show()
px.box(results, y = ['cv_mse', 'mse'], points='all', labels={'variable':'', 'value':''}, hover_name=results.index).show()

improve model¶

https://www.kaggle.com/code/marcinrutecki/gridsearchcv-kfold-cv-the-right-way

use only training set for tuning, evaluate on test set later

In [32]:
preferred_scoring = 'mean_squared_error'

get best features¶

(exhaustive search)

In [33]:
def all_combinations(items):
    result = []
    for r in range(1, len(items) + 1):
        combinations = itertools.combinations(items, r)
        result.extend(combinations)
    return result

possible_feature_combinations = all_combinations(possible_features)
possible_feature_combinations = [list(combination) for combination in possible_feature_combinations]
In [34]:
feature_combinations_scores = pd.DataFrame()

for i, feature_combination in enumerate(tqdm(possible_feature_combinations)):
    scores = cross_validate(
        random_forest_regressor, 
        X_train[feature_combination], 
        y_train, 
        cv=kf,
        scoring=scoring_functions,
    )

    mae = np.mean(-scores['test_neg_mean_absolute_error'])
    mse = np.mean(-scores['test_neg_mean_squared_error'])
    r2 = np.mean(scores['test_r2'])

    for feature in possible_features:
        feature_combinations_scores.at[i, feature] = 1 if feature in feature_combination else 0
    feature_combinations_scores.at[i, 'mean_absolute_error'] = mae
    feature_combinations_scores.at[i, 'mean_squared_error'] = mse
    feature_combinations_scores.at[i, 'r2'] = r2
  0%|          | 0/63 [00:00<?, ?it/s]
In [35]:
best_feature_combination = feature_combinations_scores.sort_values(by=preferred_scoring).head(1)
best_feature_combination = best_feature_combination.columns[best_feature_combination.iloc[0] == 1].tolist()
In [36]:
pd.concat([
    feature_combinations_scores.sort_values(by=preferred_scoring).head(5), 
    feature_combinations_scores.sort_values(by=preferred_scoring).tail(5)
])
Out[36]:
author_birth_year author_death_year first_anth_year mean_anth_year last_anth_year text_count mean_absolute_error mean_squared_error r2
42 1.0 1.0 1.0 0.0 1.0 0.0 6.053746 85.468251 0.905770
58 1.0 1.0 1.0 0.0 1.0 1.0 6.084587 85.804644 0.905456
62 1.0 1.0 1.0 1.0 1.0 1.0 6.115095 86.433683 0.904782
56 1.0 1.0 1.0 1.0 1.0 0.0 6.106740 86.580804 0.904561
21 1.0 1.0 1.0 0.0 0.0 0.0 5.984260 86.834208 0.904350
19 0.0 0.0 0.0 1.0 0.0 1.0 15.004678 415.485917 0.542472
20 0.0 0.0 0.0 0.0 1.0 1.0 15.502078 440.388709 0.513857
3 0.0 0.0 0.0 1.0 0.0 0.0 15.889946 468.937691 0.482761
4 0.0 0.0 0.0 0.0 1.0 0.0 17.128785 537.446106 0.406882
5 0.0 0.0 0.0 0.0 0.0 1.0 23.111350 811.555413 0.108981

get best params¶

In [37]:
param_grid = {
    'n_estimators': [100, 400, 800, 1000],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
}

grid_search = GridSearchCV(
    estimator=random_forest_regressor, 
    param_grid=param_grid,
    cv=kf, 
    scoring='neg_'+preferred_scoring,
    n_jobs=-1,
    verbose=1
)

grid_search.fit(
    X_train[best_feature_combination],
    y_train
)
Fitting 5 folds for each of 48 candidates, totalling 240 fits
Out[37]:
GridSearchCV(cv=KFold(n_splits=5, random_state=1, shuffle=True),
             estimator=RandomForestRegressor(random_state=42), n_jobs=-1,
             param_grid={'max_depth': [10, 20, 30], 'min_samples_leaf': [1, 2],
                         'min_samples_split': [2, 5],
                         'n_estimators': [100, 400, 800, 1000]},
             scoring='neg_mean_squared_error', verbose=1)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
GridSearchCV(cv=KFold(n_splits=5, random_state=1, shuffle=True),
             estimator=RandomForestRegressor(random_state=42), n_jobs=-1,
             param_grid={'max_depth': [10, 20, 30], 'min_samples_leaf': [1, 2],
                         'min_samples_split': [2, 5],
                         'n_estimators': [100, 400, 800, 1000]},
             scoring='neg_mean_squared_error', verbose=1)
RandomForestRegressor(max_depth=10, min_samples_split=5, n_estimators=400,
                      random_state=42)
RandomForestRegressor(max_depth=10, min_samples_split=5, n_estimators=400,
                      random_state=42)
In [38]:
best_params = grid_search.best_params_
best_score = -grid_search.best_score_

print("Feature importance :", grid_search.best_estimator_.feature_importances_)
print("Best CV score      :", best_score)
Feature importance : [0.88777482 0.05129049 0.04988355 0.01105114]
Best CV score      : 83.0962694685561
In [39]:
print(f"best feature combination : {best_feature_combination}")
print(f"best parameters          : {best_params}")
best feature combination : ['author_birth_year', 'author_death_year', 'first_anth_year', 'last_anth_year']
best parameters          : {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 400}

eval improved model¶

In [40]:
# only results for test set are informative
best_random_forest_regressor = RandomForestRegressor(**best_params, random_state=42)
eval_model(best_random_forest_regressor, features=best_feature_combination)
Out[40]:
mean_absolute_error mean_squared_error r2
cv_complete_set 6.226239 81.173910 0.909564
cv_train_set 6.271163 83.096269 0.908548
test_set 6.403285 81.055716 0.887285

predict¶

In [41]:
# vanilla
final_feature_combination = possible_features
final_params = RandomForestRegressor(random_state=42).get_params()
X_train_final = X
y_train_final = y

# improved
# final_feature_combination = best_feature_combination
# final_params = best_params
# final_params.update({"random_state":42})
# X_train_final = X_train
# y_train_final = y_train
In [42]:
# create model
random_forest_regressor = RandomForestRegressor(**final_params)
random_forest_regressor.fit(X_train_final[final_feature_combination], y_train_final)
Out[42]:
RandomForestRegressor(random_state=42)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomForestRegressor(random_state=42)
In [43]:
# apply model to data
data_predict = data.dropna(subset=final_feature_combination).copy()
data_predict['year_predict'] = random_forest_regressor.predict(data_predict[final_feature_combination])
data_predict = data_predict.reset_index(drop=True)
year_predict_dic = dict(zip(data_predict['author_title'], data_predict['year_predict']))
In [44]:
for i, element in enumerate(meta.iloc):
    this_author_title = element['author_title']

    if this_author_title in data_predict['author_title'].tolist():
        meta.at[i, 'year_predict_rfr'] = year_predict_dic[this_author_title]

Combine Predictions¶

In [45]:
rfr_available = meta.query("year_predict_rfr.notna()").shape[0]
middle_available = meta.query("year_predict_middle.notna()").shape[0]
ages_mean_available = meta.query("year_predict_ages_mean.notna()").shape[0]

print(f"all texts     : {meta.shape[0]}\n")

print(f"Available Predictions")
print(f"random forest : {rfr_available}")
print(f"year middle   : {middle_available}")
print(f"ages mean     : {ages_mean_available}")
all texts     : 21303

Available Predictions
random forest : 18148
year middle   : 18148
ages mean     : 18428
In [46]:
earliest_anthology_year_dict = meta.groupby('author_title')['anthology_year_used_ed'].min().to_dict()
year_gt_filled = meta.groupby('author_title')['year_gt'].transform(lambda x: x.ffill().bfill()) 

for i, element in enumerate(meta.iloc):
    final_year = float('NaN')
    
    this_author_title = element['author_title']
    
    this_year_gt = year_gt_filled.iloc[i]
    this_year_predict_rfr = element['year_predict_rfr']
    this_year_predict_ages_mean = element['year_predict_ages_mean']
    
    this_birth_year = element['author_birth']
    this_death_year = element['author_death']
    this_earliest_anthology_year = earliest_anthology_year_dict[this_author_title]

    if element.id == '1912.Werner.150':
        print()
    
    if pd.notna(this_year_gt):
        final_year = this_year_gt
    elif pd.notna(this_year_predict_rfr):
        final_year = round(this_year_predict_rfr)
    elif pd.notna(this_year_predict_ages_mean):
        final_year = round(this_year_predict_ages_mean)

    # check (nicht vor Geburtsdatum, nicht nach Tod, nicht nach erster Anthologie)
    this_min_possible = this_birth_year
    this_max_possible = np.nanmin([this_death_year, this_earliest_anthology_year])
    
    if final_year < this_min_possible:
        final_year = this_min_possible
    elif final_year > this_max_possible:
        final_year = this_max_possible
    
    meta.at[i, 'year'] = final_year

Check¶

In [47]:
print("year_gt")
print(f"year_gt (count)     : {meta['year_gt'].dropna().shape[0]}")
print(f"year_gt (mean)      : {meta['year_gt'].mean()}")
print(f"ages (mean)         : {(meta['year_gt'].dropna()-meta['author_birth'].dropna()).mean()}")

print("\nages_mean")
print(f"year_predict (mean) : {meta.query('year_gt.notna()')['year_predict_ages_mean'].mean()}")
print(f"Mean Absolute Error : {np.mean(abs(meta['year_predict_ages_mean']-meta['year_gt']))}")
print(f"Mean Squared Error  : {np.mean(abs(meta['year_predict_ages_mean']-meta['year_gt'])**2)}")

print("\nrandom forest")
print(f"year_predict (mean) : {meta.query('year_gt.notna()')['year_predict_rfr'].mean()}")
print(f"Mean Absolute Error : {np.mean(abs(meta['year_predict_rfr']-meta['year_gt']))}")
print(f"Mean Squared Error  : {np.mean(abs(meta['year_predict_rfr']-meta['year_gt'])**2)}")
year_gt
year_gt (count)     : 3507
year_gt (mean)      : 1871.9709153122326
ages (mean)         : 39.2149236531259

ages_mean
year_predict (mean) : 1872.4391506296618
Mean Absolute Error : 9.896605791689172
Mean Squared Error  : 147.65328658988872

random forest
year_predict (mean) : 1872.3506071709696
Mean Absolute Error : 3.1869898798383254
Mean Squared Error  : 27.88486060124036
In [48]:
# Fehlende Datierungen?
pd.set_option('display.width', 1000)
modcanon_authors = ['Hofmannsthal, Hugo von', 'Rilke, Rainer Maria', 'George, Stefan', 'Heym, Georg']
muench_authors = ['Münchhausen, Börries von', 'Miegel, Agnes', 'Strauß und Torney, Lulu von']

results_searched = (
    meta
    .query("1850 <= year <= 1918")
    .query("corpus == 'anth' or author in @modcanon_authors or author in @muench_authors")
    .query("year_search_status != 'not_searched'")
    .sort_values(by = 'year')
)

results_notsearched = (
    meta
    .query("1850 <= year <= 1918")
    .query("corpus == 'anth' or author in @modcanon_authors or author in @muench_authors")
    .query("year_search_status == 'not_searched'")
    .drop_duplicates(subset='author_title')
    .sort_values(by = 'year')
)
results_notsearched = results_notsearched[~results_notsearched['author_title'].isin(results_searched['author_title'])]

if results_notsearched.shape[0] == 0:
    print('Keine fehlenden Datierungen')
else:
    print('Fehlende Datierungen:\n')
    print(results_notsearched[['id', 'author', 'title', 'year', 'year_gt', 'corpus']])

test_ids = ['1892/93.Tetzner.1.067', '1891.Brümmer.521']
test_results = meta.query("id.isin(@test_ids)")
print("\nTests:")
print(test_results[['id', 'author', 'title', 'year', 'year_gt', 'year_predict_ages_mean', 'year_predict_rfr']])
Keine fehlenden Datierungen

Tests:
                          id          author                         title    year  year_gt  year_predict_ages_mean  year_predict_rfr
13395       1891.Brümmer.521  Döring, Moritz  Die weiße Kuh von Courcelles  1850.0      NaN             1837.252461          1849.705
13789  1892/93.Tetzner.1.067     Dahn, Felix                      Gotenzug  1876.0   1876.0             1873.252461          1875.150